In [ ]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle
import time

prep 1: melt and append


In [ ]:
# load full data
trn = pd.read_csv('../input/train_ver2.csv')
tst = pd.read_csv('../input/test_ver2.csv')
labels = pd.read_csv('../input/labels.csv').astype(int)

# prepare lag data
trn_dates = ['2015-01-28','2015-02-28','2015-03-28','2015-04-28','2015-05-28']
tst_dates = ['2016-01-28','2016-02-28','2016-03-28','2016-04-28','2016-05-28']

temp = trn[trn['fecha_dato'] == '2015-06-28']['ncodpers']
trn_ncodpers = temp[(labels[trn['fecha_dato'] == '2015-06-28'].sum(axis=1) > 0).values].values.tolist()
tst_ncodpers = np.unique(tst['ncodpers']).tolist()

trn_trim = trn[trn['fecha_dato'].isin(trn_dates)]
trn_trim = trn_trim[trn_trim['ncodpers'].isin(trn_ncodpers)]
tst_trim = trn[trn['fecha_dato'].isin(tst_dates)]
tst_trim = tst_trim[tst_trim['ncodpers'].isin(tst_ncodpers)]

# melt labels for trn

fecha_dato = trn['fecha_dato']
train_index = (labels[fecha_dato == '2015-06-28'].sum(axis=1) > 0)
train_index = train_index[train_index == True]
train = trn.ix[train_index.index]
train.iloc[:,24:] = labels.ix[train_index.index]

trn_june = []
for ind, (run, row) in enumerate(train.iterrows()):
    for i in range(24):
        if row[24+i] == 1:
            temp = row[:24].values.tolist()
            temp.append(i)
            trn_june.append(temp)
            
# define and save target separately
target = pd.DataFrame(trn_june)[24].values.tolist()
target = pd.DataFrame(target)
print('# target shape : ({})'.format(len(target)))

# make full data set 
trn_june = pd.DataFrame(trn_june, columns=trn.columns[:25]).iloc[:,:-1]
trn = pd.concat([trn_trim, trn_june], axis=0)
tst = pd.concat([tst_trim, tst], axis=0)
print(trn.shape, tst.shape)

prep 2: Label Encode


In [ ]:
# clean data
skip_cols = ['fecha_dato','ncodpers']
target_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1',
               'ind_cder_fin_ult1', 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 'ind_ctpp_fin_ult1',
               'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1',
               'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1',
               'ind_plan_fin_ult1', 'ind_pres_fin_ult1', 'ind_reca_fin_ult1',
               'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 'ind_viv_fin_ult1',
               'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1']

for col in trn.columns:
    if col in skip_cols:
        continue
    
    if col == 'ind_empleado':
        trn[col].fillna('S', inplace=True)
    elif col == 'age':
        trn[col].replace(' NA',0,inplace=True)
        trn[col] = trn[col].astype(str).astype(int)
        trn[col] = trn[col].astype(str).astype(int)
        continue
    elif col == 'fecha_alta':
        trn[col] = ((pd.to_datetime(trn['fecha_dato']) - pd.to_datetime(trn[col].fillna('2015-07-01')))/ np.timedelta64(1, 'D')).astype(int)
        tst[col] = ((pd.to_datetime(tst['fecha_dato']) - pd.to_datetime(tst[col]))/np.timedelta64(1, 'D')).astype(int)
        continue
    elif col == 'antiguedad':
        trn[col].replace('     NA',-1,inplace=True)
        trn[col] = trn[col].astype(str).astype(int)
        tst[col] = tst[col].astype(str).astype(int)
        continue
    elif col == 'ult_fec_cli_1t':
        trn[col] = ((pd.to_datetime(trn['fecha_dato']) - pd.to_datetime(trn[col].fillna('2015-06-30')))/ np.timedelta64(1, 'D')).astype(int)
        tst[col] = ((pd.to_datetime(tst['fecha_dato']) - pd.to_datetime(tst[col].fillna('2016-01-03')))/np.timedelta64(1, 'D')).astype(int)
        continue
    elif col == 'indrel_1mes':
        tst[col].replace('1','1.0',inplace=True)
        tst[col].replace('2','1.0',inplace=True)
        tst[col].replace('2.0','1.0',inplace=True)
        tst[col].replace(2.0,'1.0',inplace=True)
        tst[col].replace('3','3.0',inplace=True)
        tst[col].replace('4','3.0',inplace=True)
        tst[col].replace(4.0,'3.0',inplace=True)
        tst[col].replace('4.0','3.0',inplace=True)
        tst[col].replace('P','3.0',inplace=True)
    elif col == 'tiprel_1mes':
        tst[col].replace('N','I',inplace=True)
        tst[col].replace('R','P',inplace=True)
    elif col == 'indresi':
        trn[col].fillna('N',inplace=True)
    elif col == 'indext':
        trn[col].fillna('S',inplace=True)
    elif col == 'indfall':
        trn[col].fillna('N',inplace=True)
    elif col == 'tipodom':
        trn.drop([col], axis=1, inplace=True)
        tst.drop([col], axis=1, inplace=True)
        continue
    elif col == 'ind_actividad_cliente':
        trn[col].fillna(0.0, inplace=True)
    elif col == 'renta':
        tst[col].replace('         NA',0,inplace=True)
        trn[col].fillna(-1, inplace=True)
        tst[col].fillna(-1, inplace=True)
        trn[col] = trn[col].astype(str).astype(float).astype(int)
        tst[col] = tst[col].astype(str).astype(float).astype(int)
        continue
    elif col in target_cols:
        trn[col].fillna(0, inplace=True)
        trn[col] = trn[col].astype(int)
        tst[col].fillna(0, inplace=True)
        tst[col] = tst[col].astype(int)
        
    lb = LabelEncoder()
    lb.fit(pd.concat([trn[col].astype(str), tst[col].astype(str)], axis=0))
    trn[col] = lb.transform(trn[col].astype(str))
    tst[col] = lb.transform(tst[col].astype(str))

prep 3: append lag

- for each ncodpers in June data, append Jan~May (48x5) to single June data

In [ ]:
trn_june = trn[trn['fecha_dato'] == '2015-06-28'].drop(target_cols, axis=1)
trn_othr = trn[trn['fecha_dato'] != '2015-06-28']
tst_june = tst[tst['fecha_dato'] == '2016-06-28'].drop(target_cols, axis=1)
tst_othr = tst[tst['fecha_dato'] != '2016-06-28']

In [ ]:
st = time.time()
drop_cols = ['fecha_dato','ncodpers']

print('# Appending trn data.. {} rows'.format(trn_june.shape[0]))
trn_append = []
for i, ncodper in enumerate(trn_june['ncodpers']):
    temp = trn_othr[trn_othr['ncodpers'] == ncodper].drop(drop_cols, axis=1)
    if temp.shape[0] == 0:
        row = ['NA']*225
    else:
        row = np.hstack([temp.shift(periods=i).iloc[-1,:] for i in range(temp.shape[0])]).tolist()
    trn_append.append(trn_june.iloc[i].drop(drop_cols).values.tolist() + row)
    
    if i % int(trn_june.shape[0]/10) == 0:
        print('# {} rows.. {} secs..'.format(i, round(time.time() - st),2))

st = time.time()
print('# Appending tst data.. {} rows'.format(tst_june.shape[0]))
tst_append = []
for i, ncodper in enumerate(tst_june['ncodpers']):
    temp = tst_othr[tst_othr['ncodpers'] == ncodper].drop(drop_cols, axis=1)
    if temp.shape[0] == 0:
        row = ['NA']*225
    else:
        row = np.hstack([temp.shift(periods=i).iloc[-1,:] for i in range(temp.shape[0])]).tolist()
    tst_append.append(tst_june.iloc[i].drop(drop_cols).values.tolist() + row)
    
    if i % int(tst_june.shape[0]/10) == 0:
        print('# {} rows.. {} secs..'.format(i, round(time.time() - st),2))

# 150 secs for trn
# 9964 secs for tst

In [ ]:
colnames = trn_june.drop(drop_cols, axis=1).columns.values.tolist()
suffixes = ['_lag_one','_lag_two','_lag_thr','_lag_fou','_lag_fiv']
for suffix in suffixes:
    for col in trn_othr.drop(drop_cols, axis=1).columns.values.tolist():
        colnames.append(col+suffix)
print(len(colnames))

# initialize column names
trn = pd.DataFrame(trn_append, columns=colnames)
tst = pd.DataFrame(tst_append, columns=colnames)
print('# trn : {} | tst : {}'.format(trn.shape, tst.shape))

In [ ]:
trn.to_csv('../input/train_append_lb_lag.csv', index=False)
tst.to_csv('../input/test_append_lb_lag.csv', index=False)